Alexander Feldman V.1.1
This project investigates user behavior for the gym chain Model Fitness.
The goals of the project:
#!pip install plotly --upgrade
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
# Open the dataset.
# Catch only FileNotFoundError: the original bare `except:` would also hide
# unrelated failures (parse errors, permission errors, keyboard interrupts).
try:
    # path for working on the platform
    data = pd.read_csv('/datasets/gym_churn_us.csv', sep=',')
except FileNotFoundError:
    # path for local working
    data = pd.read_csv('datasets/gym_churn_us.csv', sep=',')
data.head()
# Tidy the raw data: normalize column names, round money amounts,
# and cast the contract countdown to whole months.
data.columns = [name.lower() for name in data.columns]
data['avg_additional_charges_total'] = data['avg_additional_charges_total'].round(2)
data['month_to_end_contract'] = data['month_to_end_contract'].astype(int)
# Inspect dtypes/non-null counts and summary statistics to spot missing values.
display(data.info(), data.describe())
There are no missing values. The columns 'Avg_additional_charges_total' and 'Lifetime' are not normally distributed.
# Compare mean feature values between the churned and retained groups.
group_avg = data.groupby('churn').mean().reset_index()
group_avg
As we can see from the table, some features differ significantly on average between the groups: Partner, Promo friends, Contract period, Group visits, Lifetime, and so on.
# Count users per churn status (row count per group; any column would do,
# but size() is explicit and does not depend on a specific column).
churn_count = data.groupby('churn').size().reset_index(name='n_users')
churn_count = churn_count.rename(columns={'churn': 'status'})
# Map the 0/1 codes to readable labels on the rows themselves: the original
# passed a bare positional list to `names=`, which would silently mislabel
# the slices if the row order ever changed.
churn_count['status'] = churn_count['status'].map({0: 'Stayed users', 1: 'Churned users'})
# Plot the share of churned vs. stayed users.
fig = px.pie(churn_count, values='n_users', names='status',
             color_discrete_sequence=px.colors.qualitative.Set2)
fig.update_traces(textinfo='value + percent')
fig.update_layout(title={'text': 'Churned users and Stayed users', 'x': 0.5})
fig.show()
As you can see from the pie chart, 26.5% of users are going to give up the services. This is a fairly high churn rate; the business has clear problems.
# Plot per-feature distributions, split by churn status (stayed vs churned).
data_feats = data.drop('churn', axis=1)
fig = make_subplots(rows=5, cols=3, subplot_titles=("Gender", "Near location", "Partners program",
                                                    "Friends promo", "Is there a phone?", "Contract period",
                                                    "Group visits", "Age", "Total additional charges",
                                                    "Months to end contract", "Lifetime", "Visits frequency total",
                                                    "Visits frequency in current month"))
col = 1
row = 1
statuses = ['Stayed', 'Churned']
colors = px.colors.qualitative.Set2
# Iterate column names only: the original unpacked `.items()` but never used
# the column values. Each feature occupies one subplot cell.
for label in data_feats.columns:
    for ch in range(2):
        status = statuses[ch]
        color = colors[ch + 2]
        # Show the legend only on the 'gender' traces so each status appears
        # exactly once in the legend (same effect as the original branches).
        fig.add_trace(go.Histogram(x=data.query('churn == @ch')[label],
                                   name=status, text=status,
                                   showlegend=(label == 'gender'),
                                   marker={'color': color}), row, col)
    # Replace numeric tick values with readable labels where appropriate.
    if label == 'gender':
        fig.update_xaxes(tickvals=[0, 1], ticktext=['Female', 'Male'], row=row, col=col)
    if label in ['near_location', 'partner', 'promo_friends', 'phone', 'group_visits']:
        fig.update_xaxes(tickvals=[0, 1], ticktext=['No', 'Yes'], row=row, col=col)
    if label in ['contract_period', 'month_to_end_contract']:
        fig.update_xaxes(tickvals=[1, 6, 12], row=row, col=col)
    # Advance to the next subplot cell (3 columns per row).
    if col < 3:
        col += 1
    else:
        col = 1
        row += 1
fig.update_layout(title={'text': 'Distibutions of features for stayed and churned users', 'x': 0.5},
                  height=1500, bargap=0.1, barmode='group')
fig.show()
As a rule, churned users have short contracts and lifetimes, do not take part in the partner and promo programs, and visit less frequently.
# Visualize pairwise feature correlations as an annotated heatmap.
plt.figure(figsize=(12, 8))
heatmap_axes = sns.heatmap(data_feats.corr(), annot=True, cmap="Purples")
plt.title('Correlation matrix of the features', fontdict={'size': 15})
plt.show()
As we can see from the heatmap, there is a high correlation between the contract period and the months to the end of the contract (97%), and between the total average visit frequency and the current-month average visit frequency (95%). We will account for this when predicting.
You can also notice that the marketing programs (partners and friends) have a moderate correlation (45%). Users likely try one program, appreciate the benefits, and then take part in the other. Moreover, the promo activity of users depends on the length of the contract (31% correlation for the partners program).
# Forecast user churn with logistic regression and random forest models.
def predict_user_churn(X, y, test_size=0.2, random_state=0):
    """Train two churn classifiers and report their evaluation metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per user.
    y : pandas.Series
        Binary churn target (1 = churned).
    test_size : float, default 0.2
        Share of the data held out for evaluation
        (default matches the previously hard-coded value).
    random_state : int, default 0
        Seed for the split and both models, for reproducibility
        (default matches the previously hard-coded value).

    Returns
    -------
    pandas Styler
        Accuracy, precision and recall for 'Logistic Regression'
        and 'Random Forest', formatted to two decimals.
    """
    # Hold out a test set and standardize features; the scaler is fit on the
    # training split only, to avoid leaking test-set statistics.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    scaler = StandardScaler()
    X_train_st = scaler.fit_transform(X_train)
    X_test_st = scaler.transform(X_test)
    # Train and predict with the Logistic Regression model.
    lr_model = LogisticRegression(random_state=random_state)
    lr_model.fit(X_train_st, y_train)
    y_prediction_lr = lr_model.predict(X_test_st)
    # Train and predict with the Random Forest model
    # (scaling is unnecessary for trees but harmless).
    rf_model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rf_model.fit(X_train_st, y_train)
    y_prediction_rf = rf_model.predict(X_test_st)
    # Summarize the metrics side by side, one row per model.
    result = pd.DataFrame(
        data={'Accuracy': [accuracy_score(y_test, y_prediction_lr),
                           accuracy_score(y_test, y_prediction_rf)],
              'Precision': [precision_score(y_test, y_prediction_lr),
                            precision_score(y_test, y_prediction_rf)],
              'Recall': [recall_score(y_test, y_prediction_lr),
                         recall_score(y_test, y_prediction_rf)]},
        index=['Logistic Regression', 'Random Forest'])
    return result.style.format('{:.2f}')
# Build churn-prediction models on the full feature set.
target = data['churn']
predict_user_churn(X=data.drop('churn', axis=1), y=target)
# Build the models again, accounting for feature correlation: two pairs of
# columns are highly correlated, so drop one column from each pair
# to avoid multicollinearity.
reduced_features = data.drop(
    ['churn', 'month_to_end_contract', 'avg_class_frequency_current_month'], axis=1)
predict_user_churn(X=reduced_features, y=target)
As we can see from the first table, the Logistic Regression method shows higher accuracy, precision and recall than the Random Forest method.
After excluding the highly correlated fields, we see that the accuracy of Logistic Regression becomes equal to that of the Random Forest method.
Thus, of all the model-building options, the most successful is the Logistic Regression method without eliminating multicollinearity.
# Standardize the features and build the linkage (distance) matrix
# for Ward hierarchical clustering.
# NOTE: X and X_sc are reused by the K-means step below, so keep these names.
X = data.drop('churn', axis=1)
sc = StandardScaler()
X_sc = sc.fit_transform(X)
linked = linkage(X_sc, method='ward')
# Draw the dendrogram of the resulting cluster hierarchy.
plt.figure(figsize=(12, 8))
dendrogram(linked, orientation='top', no_labels=True)
plt.title('Hierarchical clustering for Model Fitness customers ')
# Soften the plot frame: hide the top/right spines, fade the bottom/left ones.
axes = plt.gca()
for side in ('top', 'right'):
    axes.spines[side].set_alpha(0.0)
for side in ('bottom', 'left'):
    axes.spines[side].set_alpha(0.5)
plt.show()
As we can see from the plot, the linkage algorithm divides the users into 4 distinct groups.
# Cluster the customers with K-means (k=5, fixed seed for reproducibility)
# and attach the predicted cluster label to each user.
km = KMeans(n_clusters=5, random_state=0)
data['cluster'] = km.fit_predict(X_sc)
# Mean feature values per cluster, rounded for readability.
mean_feature_cluster = data.groupby('cluster', as_index=False).mean()
mean_feature_cluster.round(2)
# Plot per-feature distributions for each of the 5 clusters.
feat_cluster = data.drop('cluster', axis=1)
fig = make_subplots(rows=5, cols=3,
                    subplot_titles=("Gender", "Near location", "Partners program",
                                    "Friends promo", "Is there a phone?", "Contract period",
                                    "Group visits", "Age", "Total additional charges",
                                    "Months to end contract", "Lifetime", "Visits frequency total",
                                    "Visits frequency in current month", "Churn users"))
col = 1
row = 1
colors = px.colors.qualitative.Set2
# Iterate column names only: the original unpacked `.items()` but never used
# the column values. Each feature occupies one subplot cell.
for label in feat_cluster.columns:
    for clust in range(5):
        status = 'Cluster ' + str(clust)
        color = colors[clust]
        # Show the legend only on the 'gender' traces so each cluster appears
        # exactly once in the legend (same effect as the original branches).
        fig.add_trace(go.Histogram(x=data.query('cluster == @clust')[label],
                                   name=status, text=status,
                                   showlegend=(label == 'gender'),
                                   marker={'color': color}), row, col)
    # Replace numeric tick values with readable labels where appropriate.
    if label == 'gender':
        fig.update_xaxes(tickvals=[0, 1], ticktext=['Female', 'Male'], row=row, col=col)
    if label in ['near_location', 'partner', 'promo_friends', 'phone', 'group_visits']:
        fig.update_xaxes(tickvals=[0, 1], ticktext=['No', 'Yes'], row=row, col=col)
    if label in ['contract_period', 'month_to_end_contract']:
        fig.update_xaxes(tickvals=[1, 6, 12], row=row, col=col)
    # BUG FIX: the original tested `label in 'churn'`, a substring check that
    # would also match any column whose name is a substring of 'churn';
    # equality is what was intended.
    if label == 'churn':
        fig.update_xaxes(tickvals=[0, 1], ticktext=['Stay', 'Churn'], row=row, col=col)
    # Advance to the next subplot cell (3 columns per row).
    if col < 3:
        col += 1
    else:
        col = 1
        row += 1
fig.update_layout(title={'text': 'Distibutions of features by clusters', 'x': 0.5},
                  height=1500, bargap=0.1, barmode='stack', legend_traceorder="normal")
fig.show()
We obtained a division of the clients into 5 clusters. Note that it would be advisable to divide them into 4 clusters, as the dendrogram suggests, but the task specified a division into 5 clusters.
So, let's sort the clusters according to the degree of customer retention and characterize each of them:
# Calculate the churn rate for each cluster.
group_cr = data.groupby('cluster', as_index=False).agg({'churn': ['sum', 'count']})
group_cr.columns = ['cluster', 'churned_users', 'n_users']
# Vectorized division by column name replaces the original row-wise
# `apply(lambda x: x[1]/x[2], axis=1)`, which relied on fragile positional
# indexing of a labeled row and ran a Python loop over the rows.
group_cr['churn_rate'] = (group_cr['churned_users'] / group_cr['n_users']).round(2)
# Cast the cluster id to string so plotly treats it as a discrete category.
group_cr['cluster'] = group_cr['cluster'].astype(str)
fig = px.bar(group_cr, x="churn_rate", y="cluster", color='cluster',
             orientation='h', text='churn_rate',
             color_discrete_sequence=px.colors.qualitative.Set2)
fig.update_traces(texttemplate='-%{text:.0%}', textposition='auto')
fig.update_layout(title={'text': 'Churn rate by clusters', 'x': 0.5})
fig.show()
So, we conducted an analysis, the results of which can help us identify the key factors affecting customer churn, as well as develop measures to increase user retention.
Based on the characteristics of the clusters, we can identify parameters that may signal a client's imminent departure:
Many users from the groups with the maximum churn rate demonstrate the following behavior: they buy the minimum contract, visit once a week, decline group visits and, as a result, leave after a month.
Hypotheses why users leave:
Recommendations for the marketing department: